{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_data_folder = \"../experiments/labeled_logs_csv/\"\n",
    "output_data_folder = \"../experiments/logdata/\"\n",
    "filename = \"Production_Data.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "case_id_col = \"Case ID\"\n",
    "activity_col = \"Activity\"\n",
    "resource_col = \"Resource\"\n",
    "timestamp_col = \"Complete Timestamp\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "freq_threshold = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "# features for classifier\n",
    "static_cat_cols = [\"Part Desc\"]\n",
    "static_num_cols = [\"Work Order Qty\"]\n",
    "dynamic_cat_cols = [activity_col, resource_col, \"Report Type\", \"Worker ID\"]\n",
    "dynamic_num_cols = [\"Qty Completed\", \"Qty for MRB\", \"activity_duration\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "static_cols = static_cat_cols + static_num_cols + [case_id_col]\n",
    "dynamic_cols = dynamic_cat_cols + dynamic_num_cols + [timestamp_col]\n",
    "cat_cols = dynamic_cat_cols + static_cat_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_timestamp_features(group):\n",
    "    \n",
    "    group = group.sort_values(timestamp_col, ascending=False, kind='mergesort')\n",
    "    \n",
    "    tmp = group[timestamp_col] - group[timestamp_col].shift(-1)\n",
    "    tmp = tmp.fillna(0)\n",
    "    group[\"timesincelastevent\"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 's')))\n",
    "\n",
    "    tmp = group[timestamp_col] - group[timestamp_col].iloc[-1]\n",
    "    tmp = tmp.fillna(0)\n",
    "    group[\"timesincecasestart\"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 's')))\n",
    "    \n",
    "    tmp = group[timestamp_col].iloc[0] - group[timestamp_col]\n",
    "    tmp = tmp.fillna(0)\n",
    "    group[\"remtime\"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 's')))\n",
    "\n",
    "    group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')\n",
    "    group[\"event_nr\"] = range(1, len(group) + 1)\n",
    "    \n",
    "    return group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_open_cases(date):\n",
    "    return sum((dt_first_last_timestamps[\"start_time\"] <= date) & (dt_first_last_timestamps[\"end_time\"] > date))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(os.path.join(input_data_folder,filename), sep=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add event duration\n",
    "data[\"Complete Timestamp\"] = pd.to_datetime(data[\"Complete Timestamp\"])\n",
    "data[\"Start Timestamp\"] = pd.to_datetime(data[\"Start Timestamp\"])\n",
    "tmp = data[\"Complete Timestamp\"] - data[\"Start Timestamp\"]\n",
    "tmp = tmp.fillna(0)\n",
    "data[\"activity_duration\"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 's')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def assign_label(group):\n",
    "    tmp = group[\"Qty Rejected\"] > 0\n",
    "    tmp = tmp.reset_index()[\"Qty Rejected\"]\n",
    "    if sum(tmp) > 0:\n",
    "        idx = tmp[tmp==True].index[0]\n",
    "        group = group.iloc[:idx,:]\n",
    "        group[label_col] = pos_label\n",
    "    else:\n",
    "        group[label_col] = neg_label\n",
    "    return group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# assign labels\n",
    "label_col = \"label\"\n",
    "pos_label = \"deviant\"\n",
    "neg_label = \"regular\"\n",
    "#data = data.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col).apply(assign_label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data[static_cols + dynamic_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Part Desc</th>\n",
       "      <th>Work Order Qty</th>\n",
       "      <th>Case ID</th>\n",
       "      <th>Activity</th>\n",
       "      <th>Resource</th>\n",
       "      <th>Report Type</th>\n",
       "      <th>Worker ID</th>\n",
       "      <th>Qty Completed</th>\n",
       "      <th>Qty for MRB</th>\n",
       "      <th>activity_duration</th>\n",
       "      <th>Complete Timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Cable Head</td>\n",
       "      <td>10</td>\n",
       "      <td>Case 1</td>\n",
       "      <td>Turning &amp; Milling - Machine 4</td>\n",
       "      <td>Machine 4 - Turning &amp; Milling</td>\n",
       "      <td>S</td>\n",
       "      <td>ID4932</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>22740.0</td>\n",
       "      <td>2012-01-30 05:43:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Cable Head</td>\n",
       "      <td>10</td>\n",
       "      <td>Case 1</td>\n",
       "      <td>Turning &amp; Milling - Machine 4</td>\n",
       "      <td>Machine 4 - Turning &amp; Milling</td>\n",
       "      <td>D</td>\n",
       "      <td>ID4932</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3480.0</td>\n",
       "      <td>2012-01-30 06:42:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Cable Head</td>\n",
       "      <td>10</td>\n",
       "      <td>Case 1</td>\n",
       "      <td>Turning &amp; Milling - Machine 4</td>\n",
       "      <td>Machine 4 - Turning &amp; Milling</td>\n",
       "      <td>S</td>\n",
       "      <td>ID4167</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1320.0</td>\n",
       "      <td>2012-01-30 07:21:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Part Desc  Work Order Qty Case ID                       Activity  \\\n",
       "0  Cable Head              10  Case 1  Turning & Milling - Machine 4   \n",
       "1  Cable Head              10  Case 1  Turning & Milling - Machine 4   \n",
       "2  Cable Head              10  Case 1  Turning & Milling - Machine 4   \n",
       "\n",
       "                        Resource Report Type Worker ID  Qty Completed  \\\n",
       "0  Machine 4 - Turning & Milling           S    ID4932              1   \n",
       "1  Machine 4 - Turning & Milling           D    ID4932              1   \n",
       "2  Machine 4 - Turning & Milling           S    ID4167              0   \n",
       "\n",
       "   Qty for MRB  activity_duration  Complete Timestamp  \n",
       "0            0            22740.0 2012-01-30 05:43:00  \n",
       "1            0             3480.0 2012-01-30 06:42:00  \n",
       "2            0             1320.0 2012-01-30 07:21:00  "
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add features extracted from timestamp\n",
    "data[\"weekday\"] = data[timestamp_col].dt.weekday\n",
    "data[\"hour\"] = data[timestamp_col].dt.hour\n",
    "data = data.groupby(case_id_col).apply(extract_timestamp_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/coderus/anaconda2/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: 'Case ID' is both an index level and a column label.\n",
      "Defaulting to column, but this will raise an ambiguity error in a future version\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "# add inter-case features\n",
    "data = data.sort_values([timestamp_col], ascending=True, kind='mergesort')\n",
    "dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg([min, max])\n",
    "dt_first_last_timestamps.columns = [\"start_time\", \"end_time\"]\n",
    "data[\"open_cases\"] = data[timestamp_col].apply(get_open_cases)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/coderus/anaconda2/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: 'Case ID' is both an index level and a column label.\n",
      "Defaulting to column, but this will raise an ambiguity error in a future version\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "# impute missing values\n",
    "grouped = data.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col)\n",
    "for col in static_cols + dynamic_cols:\n",
    "    data[col] = grouped[col].transform(lambda grp: grp.fillna(method='ffill'))\n",
    "\n",
    "data[cat_cols] = data[cat_cols].fillna('missing')\n",
    "data = data.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set infrequent factor levels to \"other\"\n",
    "for col in cat_cols:\n",
    "    counts = data[col].value_counts()\n",
    "    mask = data[col].isin(counts[counts >= freq_threshold].index)\n",
    "    data.loc[~mask, col] = \"other\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv(os.path.join(output_data_folder,filename), sep=\";\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
